options(future.globals.maxSize = 2 * 1024 * 10E6);
.cran_libs <- c("purrr", "jsonlite", "httr", "summarytools", "munsell", "cachem"
, "SmartEDA", "htmltools", "slider", "stringi", "magrittr", "plotly"
, "DT", "data.table", "pdftools", "lubridate", "future", "furrr", "future.callr")
.git_libs <- paste0("book.of.", c("utilities", "features", "workflow")) |> c("architect")
if (!"book.of.workflow" %in% dir(sprintf("%s/library", R.home()))){
if (!"remotes" %in% dir(sprintf("%s/library", R.home()))){
htmltools::tags$span(style = "text-deoration:italic; color:#333333; ", "... installing missing package 'remotes'")
install.packages("remotes", repos = "https://cloud.r-project.org")
}
htmltools::tags$span(
style = "text-deoration:italic; color:#333333; "
, "... installing missing packages 'book.of.utilities', 'book.of.features', 'book.of.workflow', 'architect'"
);
remotes::install_github("delriaan/book.of.utilities", subdir = "pkg")
remotes::install_github("delriaan/book.of.features", subdir = "pkg")
remotes::install_github("delriaan/book.of.workflow", subdir = "pkg")
remotes::install_github("delriaan/architect", subdir = "pkg")
}
library(book.of.workflow)
load_unloaded(!!!.cran_libs, autoinstall = TRUE)
load_unloaded(!!!.git_libs)
urls <- list(
data = list(
`Data Dictionary` = "https://www.medicaid.gov/medicaid-chip-program-information/by-topics/prescription-drugs/downloads/recordspecficationanddefinitions.pdf"
, Data = "https://download.medicaid.gov/data/drugproducts1q_2023.csv"
) |> imap(\(x, y) tags$li(tags$a(href = x, y))) |> tags$ol()
, git_libs = map(.git_libs, \(x) tags$li(tags$a(title = paste0("delriaan/", x), x))) |> list() |> tags$ul()
, openFDA = "https://download.open.fda.gov/drug/ndc/drug-ndc-0001-of-0001.json.zip"
)
.cache <- cache_disk(dir = "r_session_cache")
Data
Wrangling
Retrieve Data
# MDRP database data
if (rlang::is_empty(dir(params$data, pattern = "api_data[.][Rr]data$"))){
if (!"api_data" %in% ls()){
api_data <- urls$data |>
as.character() |>
stri_extract_all_regex("http.+csv", simplify = TRUE) |>
as.vector() %T>%
(\(x) tags$p(sprintf("Retrieve data from '%s'", x)) |> print())() |>
GET() |>
content() |>
rawToChar() |>
(\(x) read.csv(text = x, ))() |>
as.data.table(na.rm = FALSE) |>
modify_at(c(4:7), as.character) %>%
modify_at(ls(., pattern = "Date"), lubridate::mdy) %>%
setnames(stri_replace_all_fixed(names(.) |> tolower(), ".", "_"))
}
if (!"api_dictionary" %in% ls()){
api_dictionary <- invisible(urls$data |>
as.character() |>
stri_extract_all_regex("http.+pdf", simplify = TRUE) |>
as.vector() |>
GET() |>
content() |>
pdf_text())
.summary_labels <- names(api_data) |>
rlang::set_names() |>
map_chr(\(x) stri_replace_all_fixed(x, ".", " ", vectorize_all = FALSE)) |>
imap_chr(\(x, y){
api_dictionary |>
stri_extract_all_regex(
sprintf(
fmt = "(%s)[:]\n.+"
, stri_replace_all_fixed(
x
, c("Pkg"
, "Intro"
, "COD Status"
, "FDA Application Number"
, "FDA Therapeutic Equivalence Code"
)
, c("Package"
, "Intro."
, "Covered Outpatient Drug [(]COD[)] Status"
, "FDA Application Number/OTC Monograph Number"
, "TEC"
)
, vectorize_all = FALSE
)
)
, simplify = TRUE
) |>
stats::na.omit() |>
as.vector() |>
discard(\(x) x == "") %>%
(\(i){
.out <- ifelse(identical(character(), i), y, paste(i, collapse = "\n"))
ifelse(stri_length(.out) > 50, paste0(.out, " ..."), .out)
})()
});
iwalk(.summary_labels, \(x, y){
.label = x;
api_data <<- modify_at(api_data, y, \(i){ attr(i, "label") <- .label; i })
})
}
} else {
if (!"api_data" %in% ls()){
dir(params$data, pattern = "api_data[.][Rr]data$") |> load()
}
}
Retrieve data from 'https://download.medicaid.gov/data/drugproducts1q_2023.csv'
# openFDA supplementary data
if (rlang::is_empty(dir(params$data, pattern = "openFDA_ndc[.][Rr]data$"))){
if (!"openFDA_ndc" %in% ls()){ openFDA_ndc <- (\(x){
json.file <- "drug-ndc-0001-of-0001.json";
if (!json.file %in% dir()){
GET(urls$openFDA, write_disk(x, TRUE))
unzip(zipfile = x)
}
read_json(path = json.file);
})("ndc_json.zip") |> (\(x){
data.table::rbindlist(modify_at(x$results, 1:length(x$results), as.data.table), fill = TRUE) |>
setattr("metadata", x$meta)
})()
}
} else {
if (!"openFDA_ndc" %in% ls()){
dir(params$data, pattern = "openFDA_ndc") |> load()
}
}
Prepare Data
NDC sequences come in a various formats, usually a
4-4-x, 5-4-x, or 5-3-x sequence
(each integer indicating string length). Sometimes other formats arise,
so normalizing all NDC sequences is a good idea, especially when there
is a desire (or need) to join different data containing intersecting
NDCs.
The following shows the NDC sequences in the OpenFDA and
MDRP data:
I’ll create a function to check NDC formats and conform them in order
to join the two data sets using a conformed NDC sequence:
check_ndc_format <- \(lc, pc){
pc <- modify_if(pc, \(i) stri_length(i) < 3, \(i) stri_pad_left(i, width = 3, pad = "0"))
lc <- modify_if(
lc
, \(i) stri_length(i) < 4
, \(i) stri_pad_left(i, width = ifelse(stri_length(pc) == 3, 5, 4), pad = "0")
)
paste(lc, pc, sep = "-")
}
if (rlang::is_empty(dir(params$data, pattern = "master_drug_data[.][Rr]data$"))){
if (!"master_drug_data" %in% ls()){
master_drug_data <- (\(x, i) x[
i, on = "alt_ndc==product_ndc"
, nomatch = 0
, allow.cartesian = TRUE
, `:=`(pharm_class = pharm_class
, dea_schedule = dea_schedule
, product_type = product_type
, route = route
, marketing_category = marketing_category
)
, by = .EACHI
])(api_data[, alt_ndc := map2_chr(labeler_code, product_code, check_ndc_format)]
, openFDA_ndc)
master_drug_data[, `:=`(
pharm_class := map_chr(pharm_class, \(x) unlist(x) %||% "~")
, route := map_chr(route, \(x) unlist(x) %||% "~")
)]
}
} else {
if (!"master_drug_data" %in% ls()){
dir(params$data, pattern = "master_drug_data[.][Rr]data$", full.names = TRUE) |> load()
}
}
Temporal
Analysis
master_drug_data is a great dataset for constructing
simple, time-based metrics. Given the natural order of the types of
events, it is easy to setup event sequence metrics using package lubridate
if (rlang::is_empty(dir(pattern = "ndc_events[.][Rr]data$"))){
if (!"ndc_events" %in% ls()){
ndc_events <- define(
master_drug_data
, ~alt_ndc + fda_product_name + drug_category +
pharm_class + dea_schedule + product_type +
route + marketing_category + fda_approval_date +
market_date + termination_date + reactivation_date
, unique(.SD)
, days_to_market = duration(as.integer(market_date) - as.integer(fda_approval_date), units = "days")
, on_market_age = duration({
ifelse(is.na(termination_date), ifelse(is.na(reactivation_date), today(), reactivation_date), termination_date) -
ifelse(is.na(reactivation_date), ifelse(is.na(termination_date), market_date, termination_date), reactivation_date)
}, units = "days")
, days_market_absent = duration({
ifelse(is.na(reactivation_date), today(), reactivation_date) -
ifelse(is.na(termination_date), today(), termination_date)
}, units = "days")
);
rlang::set_names(
c("Days between approval \nand market release"
, "Days active on market"
, "Days most-recently \nabsent from market")
, c("days_to_market", "on_market_age", "days_market_absent")
)|>
iwalk(\(x, y) rlang::parse_expr(sprintf("setattr(ndc_events$%s, \"label\", \"%s\")", y, x)) |> eval(envir = .GlobalEnv))
}
} else {
if (!"ndc_events" %in% ls()){
dir(pattern = "ndc_events[.][Rr]data$", full.names = TRUE) |> load()
}
}
---
title: "Medicaid Drug Rebate Program (MDRP) Database"
output: 
  html_notebook:
    code_folding: hide
params:
  data: data
---

```{r setup, warning=FALSE, message=FALSE}
options(future.globals.maxSize = 2 * 1024 * 10E6);

.cran_libs <- c("purrr", "jsonlite", "httr", "summarytools", "munsell", "cachem"
                , "SmartEDA", "htmltools", "slider", "stringi", "magrittr", "plotly"
                , "DT", "data.table", "pdftools", "lubridate", "future", "furrr", "future.callr")
.git_libs <- paste0("book.of.", c("utilities", "features", "workflow")) |> c("architect")
              
if (!"book.of.workflow" %in% dir(sprintf("%s/library", R.home()))){
  if (!"remotes" %in% dir(sprintf("%s/library", R.home()))){ 
    htmltools::tags$span(style = "text-deoration:italic; color:#333333; ", "... installing missing package 'remotes'")
    install.packages("remotes", repos = "https://cloud.r-project.org") 
  }
  
  htmltools::tags$span(
    style = "text-deoration:italic; color:#333333; "
    , "... installing missing packages 'book.of.utilities', 'book.of.features', 'book.of.workflow', 'architect'"
    );
  
  remotes::install_github("delriaan/book.of.utilities", subdir = "pkg")
  remotes::install_github("delriaan/book.of.features", subdir = "pkg")
  remotes::install_github("delriaan/book.of.workflow", subdir = "pkg")
  remotes::install_github("delriaan/architect", subdir = "pkg")
}

library(book.of.workflow)
load_unloaded(!!!.cran_libs, autoinstall = TRUE)
load_unloaded(!!!.git_libs)

urls <- list(
  data = list(
    `Data Dictionary` = "https://www.medicaid.gov/medicaid-chip-program-information/by-topics/prescription-drugs/downloads/recordspecficationanddefinitions.pdf"
    , Data = "https://download.medicaid.gov/data/drugproducts1q_2023.csv"
    ) |> imap(\(x, y) tags$li(tags$a(href = x, y))) |> tags$ol()
  , git_libs = map(.git_libs, \(x) tags$li(tags$a(title = paste0("delriaan/", x), x))) |> list() |> tags$ul()
  , openFDA = "https://download.open.fda.gov/drug/ndc/drug-ndc-0001-of-0001.json.zip"
  )

.cache <- cache_disk(dir = "r_session_cache")
```

#  {.tabset .tabset-fade .tabset-pills}

## Purpose &<br> Metadata 

This project explores the Medicaid Drug Rebate Program (MDRP) database via [API](https://data.medicaid.gov/dataset/0ad65fe5-3ad3-5d79-a3f9-7893ded7963a) calls (dataset description [here](https://www.medicaid.gov/medicaid/prescription-drugs/medicaid-drug-rebate-program/medicaid-drug-rebate-program-data/index.html)):

`r urls$data`

### Required LIbraries

```{r, echo=FALSE, cache=TRUE, cache.lazy=TRUE}
tags$table(
  style = "width:475px; "
  , tags$tr(
    tags$th(style = "text-align:middle; border: solid 2px #AAAAFF; background-color:#EEEEEE; ", "CRAN", colspan = 2, width="65%")
    , tags$th(style = "text-align:middle; border: solid 2px #AAAAFF; background-color:#AAAAAA; ", width = "*", "GitHub")
    )
  , tags$tr(
      style = "align:top"
      , tags$td(style = "text-align:middle; background-color:#AAAAAA; "
                , map(.cran_libs[1:5], \(x) tags$li(x)) |> list() |> tags$ul())
      , tags$td(style = "text-align:middle; background-color:#AAAAAA; "
                , map(.cran_libs[6:length(.cran_libs)], \(x) tags$li(x)) |> list() |> tags$ul())
      , tags$td(style = "background-color:#EEEEEE; padding-right:20px; ", urls$git_libs)
      )
  ) |> tags$p()
```

## Data <br>Wrangling 

### Retrieve Data

```{r RETRIEVE_DATA, cache=TRUE,cache.lazy=TRUE, warning=FALSE, message=FALSE}
# MDRP database data
if (rlang::is_empty(dir(params$data, pattern = "api_data[.][Rr]data$"))){
  if (!"api_data" %in% ls()){ 
    api_data <- urls$data |> 
      as.character() |> 
      stri_extract_all_regex("http.+csv", simplify = TRUE) |> 
      as.vector() %T>% 
      (\(x) tags$p(sprintf("Retrieve data from '%s'", x)) |> print())() |>
      GET() |>
      content() |>
      rawToChar() |>
      (\(x) read.csv(text = x, ))() |>
      as.data.table(na.rm = FALSE) |> 
      modify_at(c(4:7), as.character) %>% 
      modify_at(ls(., pattern = "Date"), lubridate::mdy) %>% 
      setnames(stri_replace_all_fixed(names(.) |> tolower(), ".", "_"))
  }
  
  if (!"api_dictionary" %in% ls()){ 
    api_dictionary <- invisible(urls$data |> 
      as.character() |> 
      stri_extract_all_regex("http.+pdf", simplify = TRUE) |> 
      as.vector() |>
      GET() |>
      content() |> 
      pdf_text())
    
    .summary_labels <- names(api_data) |> 
        rlang::set_names() |>
        map_chr(\(x) stri_replace_all_fixed(x, ".", " ", vectorize_all = FALSE)) |>
        imap_chr(\(x, y){ 
          api_dictionary |> 
            stri_extract_all_regex(
              sprintf(
                fmt = "(%s)[:]\n.+"
                , stri_replace_all_fixed(
                    x
                    , c("Pkg"
                        , "Intro"
                        , "COD Status"
                        , "FDA Application Number"
                        , "FDA Therapeutic Equivalence Code"
                        )
                    , c("Package"
                        , "Intro."
                        , "Covered Outpatient Drug [(]COD[)] Status"
                        , "FDA Application Number/OTC Monograph Number"
                        , "TEC"
                        )
                    , vectorize_all = FALSE
                    )
                )
              , simplify = TRUE
              ) |>
            stats::na.omit() |>
            as.vector() |>
            discard(\(x) x == "") %>%
            (\(i){ 
              .out <- ifelse(identical(character(), i), y, paste(i, collapse = "\n"))
              ifelse(stri_length(.out) > 50, paste0(.out, " ..."), .out)
            })()
        });
    
    iwalk(.summary_labels, \(x, y){ 
      .label = x; 
      api_data <<- modify_at(api_data, y, \(i){ attr(i, "label") <- .label; i }) 
    })
  }
} else { 
  if (!"api_data" %in% ls()){ 
    dir(params$data, pattern = "api_data[.][Rr]data$", full.names = TRUE) |> load() 
  }
}

# openFDA supplementary data
if (rlang::is_empty(dir(params$data, pattern = "openFDA_ndc[.][Rr]data$"))){
  if (!"openFDA_ndc" %in% ls()){ openFDA_ndc <- (\(x){ 
      json.file <- "drug-ndc-0001-of-0001.json";
    
      if (!json.file %in% dir()){ 
        tags$p(sprintf("Retrieve data from '%s'", urls$openFDA)) |> print()
        
        GET(urls$openFDA, write_disk(x, TRUE)) 
        unzip(zipfile = x)
      }
      
      read_json(path = json.file);
    })("ndc_json.zip") |> (\(x){
      data.table::rbindlist(modify_at(x$results, 1:length(x$results), as.data.table), fill = TRUE) |>
        setattr("metadata", x$meta)
    })()
  }
} else { 
  if (!"openFDA_ndc" %in% ls()){ 
    dir(params$data, pattern = "openFDA_ndc", full.names = TRUE) |> load() 
  }
}
```

### Prepare Data

NDC sequences come in a various formats, usually a `4-4-x`, `5-4-x`, or `5-3-x` sequence (each integer indicating string length).  Sometimes other formats arise, so normalizing all NDC sequences is a good idea, especially when there is a desire (or need) to join different data containing intersecting NDCs.

The following shows the NDC sequences in the *OpenFDA* and *MDRP* data:

```{r EXPLORATION, warning=FALSE, message=FALSE}
api_data[runif(n= nrow(api_data)) <= 0.33, !"id"] |>
  dfSummary(labels.col = TRUE) |> 
  view(method = "render", report.title = "Medicaid Drug Rebate Program (MDRP) Data") 


list(
  `OpenFDA: NDC Formats` = openFDA_ndc[, unique(product_ndc)] |> 
      sort() |> 
      stri_split_fixed("-") |> 
      map_chr(\(x) stri_length(x) |> paste(collapse = "-")) |> 
      table() |>
      # ratio(type = "of.sum", decimals = 6) |>
      as.data.table()
  , `MDRP: NDC Formats` = api_data[, paste(stri_length(labeler_code), stri_length(product_code), sep = "-")] |>
      sort() |> 
      table() |>
      # ratio(type = "of.sum", decimals = 6)|>
      as.data.table()
  ) |> 
  imap(\(x, y){ 
    plot_ly(  
      data = x
      , type = "pie"
      , labels = ~sprintf("[%s]", V1)
      , values = ~N
      , hole = 0.6
      , name = sprintf('NDC Format Frequency: %s', y)
      , textinfo='label+percent'
      , insidetextorientation='radial'
      )
  }) |>
  subplot() |>
      plotly::layout(
        xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
        , yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
        ) |>
  # tags$ul() |>
  tags$p()
```

I'll create a function to check NDC formats and conform them in order to join the two data sets using a conformed NDC sequence:

```{r}
check_ndc_format <- \(lc, pc){ 
  pc <- modify_if(pc, \(i) stri_length(i) < 3, \(i) stri_pad_left(i, width = 3, pad = "0"))
  lc <- modify_if(
          lc
          , \(i) stri_length(i) < 4
          , \(i) stri_pad_left(i, width = ifelse(stri_length(pc) == 3, 5, 4), pad = "0")
          )
  paste(lc, pc, sep = "-")
}
```


```{r JOIN_DRUG_DATA, warning=FALSE, message=FALSE}
if (rlang::is_empty(dir(params$data, pattern = "master_drug_data[.][Rr]data$"))){
  if (!"master_drug_data" %in% ls()){
    master_drug_data <- (\(x, i) x[
                    i, on = "alt_ndc==product_ndc"
                    , nomatch = 0
                    , allow.cartesian = TRUE
                    , `:=`(pharm_class = pharm_class
                           , dea_schedule = dea_schedule
                           , product_type = product_type
                           , route = route
                           , marketing_category = marketing_category
                           )
                    , by = .EACHI
                    ])(api_data[, alt_ndc := map2_chr(labeler_code, product_code, check_ndc_format)]
                      , openFDA_ndc)
  
    master_drug_data[, `:=`(
      pharm_class := map_chr(pharm_class, \(x) unlist(x) %||% "~")
      , route := map_chr(route, \(x) unlist(x) %||% "~")
      )]
  }
} else {
  if (!"master_drug_data" %in% ls()){ 
    dir(params$data, pattern = "master_drug_data[.][Rr]data$", full.names = TRUE) |> load()
  }
}
```


## Temporal<br>Analysis 

`master_drug_data` is a great dataset for constructing simple, time-based metrics.  Given the natural order of the types of events, it is easy to setup  event sequence metrics using package [`lubridate`](https://rdocumentation.org/packages/lubridate/versions/1.9.2)

```{r TIME_STUDY, warning=FALSE, message=FALSE}
if (rlang::is_empty(dir(pattern = "ndc_events[.][Rr]data$"))){ 
  if (!"ndc_events" %in% ls()){ 
    ndc_events <- define(
      master_drug_data
      , ~alt_ndc + fda_product_name + drug_category + 
          pharm_class + dea_schedule + product_type + 
          route + marketing_category + fda_approval_date + 
          market_date + termination_date + reactivation_date
      , unique(.SD)
      , days_to_market = duration(as.integer(market_date) - as.integer(fda_approval_date), units = "days")
      , on_market_age = duration({ 
            ifelse(is.na(termination_date), ifelse(is.na(reactivation_date), today(), reactivation_date), termination_date) - 
            ifelse(is.na(reactivation_date), ifelse(is.na(termination_date), market_date, termination_date), reactivation_date)
          }, units = "days")
      , days_market_absent = duration({ 
            ifelse(is.na(reactivation_date), today(), reactivation_date) - 
            ifelse(is.na(termination_date), today(), termination_date)
          }, units = "days")
      );
    
    rlang::set_names(
      c("Days between approval \nand market release"
          , "Days active on market"
          , "Days most-recently \nabsent from market")
      , c("days_to_market", "on_market_age", "days_market_absent")
      )|> 
      iwalk(\(x, y) rlang::parse_expr(sprintf("setattr(ndc_events$%s, \"label\", \"%s\")", y, x)) |> eval(envir = .GlobalEnv))
  }
} else {
  if (!"ndc_events" %in% ls()){ 
    dir(pattern = "ndc_events[.][Rr]data$", full.names = TRUE) |> load()
  }
}
```


```{r SAVE_DATA}
  book.of.workflow::save_image("openFDA_ndc", file.name = "openFDA_ndc"
                               , use.prefix = FALSE, use.timestamp = FALSE, safe = FALSE)
  
  book.of.workflow::save_image("api_data", "api_dictionary", file.name = "mdrp_api_data"
                               , use.prefix = FALSE, use.timestamp = FALSE, safe = FALSE)
  
  book.of.workflow::save_image("master_drug_data", file.name = "master_drug_data"
                               , use.prefix = FALSE, use.timestamp = FALSE, safe = FALSE)
  
  book.of.workflow::save_image("ndc_events", file.name = "ndc_events"
                               , use.prefix = FALSE, use.timestamp = FALSE, safe = FALSE)

```
